This compiled dataset was pulled from four other datasets linked by time and place, and was built to find signals correlated with increased suicide rates among different cohorts globally, across the socio-economic spectrum.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn import preprocessing
# Apply seaborn's "darkgrid" theme to the matplotlib/seaborn figures below.
sns.set(style='darkgrid')

# Load the suicide-rates table from a local CSV.  The path is an absolute
# Windows path, so this cell only runs where the file exists at that location.
csv_path = r"D:\Misc\Suicide_Rates_Overview_1985_to_2016.csv"
df = pd.read_csv(csv_path)
df.head(10)
| country | year | sex | age | suicides_no | population | suicides/100k pop | country-year | HDI for year | gdp_for_year ($) | gdp_per_capita ($) | generation | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Albania | 1987 | male | 15-24 years | 21 | 312900 | 6.71 | Albania1987 | NaN | 2,156,624,900 | 796 | Generation X |
| 1 | Albania | 1987 | male | 35-54 years | 16 | 308000 | 5.19 | Albania1987 | NaN | 2,156,624,900 | 796 | Silent |
| 2 | Albania | 1987 | female | 15-24 years | 14 | 289700 | 4.83 | Albania1987 | NaN | 2,156,624,900 | 796 | Generation X |
| 3 | Albania | 1987 | male | 75+ years | 1 | 21800 | 4.59 | Albania1987 | NaN | 2,156,624,900 | 796 | G.I. Generation |
| 4 | Albania | 1987 | male | 25-34 years | 9 | 274300 | 3.28 | Albania1987 | NaN | 2,156,624,900 | 796 | Boomers |
| 5 | Albania | 1987 | female | 75+ years | 1 | 35600 | 2.81 | Albania1987 | NaN | 2,156,624,900 | 796 | G.I. Generation |
| 6 | Albania | 1987 | female | 35-54 years | 6 | 278800 | 2.15 | Albania1987 | NaN | 2,156,624,900 | 796 | Silent |
| 7 | Albania | 1987 | female | 25-34 years | 4 | 257200 | 1.56 | Albania1987 | NaN | 2,156,624,900 | 796 | Boomers |
| 8 | Albania | 1987 | male | 55-74 years | 1 | 137500 | 0.73 | Albania1987 | NaN | 2,156,624,900 | 796 | G.I. Generation |
| 9 | Albania | 1987 | female | 5-14 years | 0 | 311000 | 0.00 | Albania1987 | NaN | 2,156,624,900 | 796 | Generation X |
# Preview the last 10 rows of the table.
df.tail(10)
| country | year | sex | age | suicides_no | population | suicides/100k pop | country_year | HDI for year | gdp_for_year ($) | gdp_per_capita | generation | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 27810 | Uzbekistan | 2014 | female | 15-24 years | 347 | 2992817 | 11.59 | Uzbekistan2014 | 0.675 | 63,067,077,179 | 2309 | Millenials |
| 27811 | Uzbekistan | 2014 | male | 55-74 years | 144 | 1271111 | 11.33 | Uzbekistan2014 | 0.675 | 63,067,077,179 | 2309 | Boomers |
| 27812 | Uzbekistan | 2014 | male | 15-24 years | 347 | 3126905 | 11.10 | Uzbekistan2014 | 0.675 | 63,067,077,179 | 2309 | Millenials |
| 27813 | Uzbekistan | 2014 | male | 75+ years | 17 | 224995 | 7.56 | Uzbekistan2014 | 0.675 | 63,067,077,179 | 2309 | Silent |
| 27814 | Uzbekistan | 2014 | female | 25-34 years | 162 | 2735238 | 5.92 | Uzbekistan2014 | 0.675 | 63,067,077,179 | 2309 | Millenials |
| 27815 | Uzbekistan | 2014 | female | 35-54 years | 107 | 3620833 | 2.96 | Uzbekistan2014 | 0.675 | 63,067,077,179 | 2309 | Generation X |
| 27816 | Uzbekistan | 2014 | female | 75+ years | 9 | 348465 | 2.58 | Uzbekistan2014 | 0.675 | 63,067,077,179 | 2309 | Silent |
| 27817 | Uzbekistan | 2014 | male | 5-14 years | 60 | 2762158 | 2.17 | Uzbekistan2014 | 0.675 | 63,067,077,179 | 2309 | Generation Z |
| 27818 | Uzbekistan | 2014 | female | 5-14 years | 44 | 2631600 | 1.67 | Uzbekistan2014 | 0.675 | 63,067,077,179 | 2309 | Generation Z |
| 27819 | Uzbekistan | 2014 | female | 55-74 years | 21 | 1438935 | 1.46 | Uzbekistan2014 | 0.675 | 63,067,077,179 | 2309 | Boomers |
# Normalise a few awkward column names in place, then display the frame.
# NOTE(review): the first key (' gdp_for_year ($ ) ') has an extra space inside
# the parentheses, so it matches no column and the total-GDP column keeps its
# original name ' gdp_for_year ($) '.  Later cells index it by that original
# name, so correcting the key here requires updating those cells as well.
column_renames = {
    ' gdp_for_year ($ ) ': 'gdp_for_year',
    'gdp_per_capita ($)': 'gdp_per_capita',
    'country-year': 'country_year',
}
df.rename(columns=column_renames, inplace=True)
df
| country | year | sex | age | suicides_no | population | suicides/100k pop | country_year | HDI for year | gdp_for_year ($) | gdp_per_capita | generation | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Albania | 1987 | male | 15-24 years | 21 | 312900 | 6.71 | Albania1987 | NaN | 2,156,624,900 | 796 | Generation X |
| 1 | Albania | 1987 | male | 35-54 years | 16 | 308000 | 5.19 | Albania1987 | NaN | 2,156,624,900 | 796 | Silent |
| 2 | Albania | 1987 | female | 15-24 years | 14 | 289700 | 4.83 | Albania1987 | NaN | 2,156,624,900 | 796 | Generation X |
| 3 | Albania | 1987 | male | 75+ years | 1 | 21800 | 4.59 | Albania1987 | NaN | 2,156,624,900 | 796 | G.I. Generation |
| 4 | Albania | 1987 | male | 25-34 years | 9 | 274300 | 3.28 | Albania1987 | NaN | 2,156,624,900 | 796 | Boomers |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27815 | Uzbekistan | 2014 | female | 35-54 years | 107 | 3620833 | 2.96 | Uzbekistan2014 | 0.675 | 63,067,077,179 | 2309 | Generation X |
| 27816 | Uzbekistan | 2014 | female | 75+ years | 9 | 348465 | 2.58 | Uzbekistan2014 | 0.675 | 63,067,077,179 | 2309 | Silent |
| 27817 | Uzbekistan | 2014 | male | 5-14 years | 60 | 2762158 | 2.17 | Uzbekistan2014 | 0.675 | 63,067,077,179 | 2309 | Generation Z |
| 27818 | Uzbekistan | 2014 | female | 5-14 years | 44 | 2631600 | 1.67 | Uzbekistan2014 | 0.675 | 63,067,077,179 | 2309 | Generation Z |
| 27819 | Uzbekistan | 2014 | female | 55-74 years | 21 | 1438935 | 1.46 | Uzbekistan2014 | 0.675 | 63,067,077,179 | 2309 | Boomers |
27820 rows × 12 columns
# List the column names to check the result of the rename above.
df.columns
Index(['country', 'year', 'sex', 'age', 'suicides_no', 'population',
'suicides/100k pop', 'country_year', 'HDI for year',
' gdp_for_year ($) ', 'gdp_per_capita', 'generation'],
dtype='object')
# Per-column dtype and non-null count summary.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 27820 entries, 0 to 27819 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 country 27820 non-null object 1 year 27820 non-null int64 2 sex 27820 non-null object 3 age 27820 non-null object 4 suicides_no 27820 non-null int64 5 population 27820 non-null int64 6 suicides/100k pop 27820 non-null float64 7 country_year 27820 non-null object 8 HDI for year 8364 non-null float64 9 gdp_for_year ($) 27820 non-null object 10 gdp_per_capita 27820 non-null int64 11 generation 27820 non-null object dtypes: float64(2), int64(4), object(6) memory usage: 2.5+ MB
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()
| year | suicides_no | population | suicides/100k pop | HDI for year | gdp_per_capita | |
|---|---|---|---|---|---|---|
| count | 27820.000000 | 27820.000000 | 2.782000e+04 | 27820.000000 | 8364.000000 | 27820.000000 |
| mean | 2001.258375 | 242.574407 | 1.844794e+06 | 12.816097 | 0.776601 | 16866.464414 |
| std | 8.469055 | 902.047917 | 3.911779e+06 | 18.961511 | 0.093367 | 18887.576472 |
| min | 1985.000000 | 0.000000 | 2.780000e+02 | 0.000000 | 0.483000 | 251.000000 |
| 25% | 1995.000000 | 3.000000 | 9.749850e+04 | 0.920000 | 0.713000 | 3447.000000 |
| 50% | 2002.000000 | 25.000000 | 4.301500e+05 | 5.990000 | 0.779000 | 9372.000000 |
| 75% | 2008.000000 | 131.000000 | 1.486143e+06 | 16.620000 | 0.855000 | 24874.000000 |
| max | 2016.000000 | 22338.000000 | 4.380521e+07 | 224.970000 | 0.944000 | 126352.000000 |
# Pairwise correlation (pandas default: Pearson) of the numeric columns.
# The numeric columns are selected explicitly because the frame also holds
# text columns (country, sex, age, ...): pandas >= 2.0 no longer drops those
# silently in DataFrame.corr() and raises instead.
df.select_dtypes(include='number').corr()
| year | suicides_no | population | suicides/100k pop | HDI for year | gdp_per_capita | |
|---|---|---|---|---|---|---|
| year | 1.000000 | -0.004546 | 0.008850 | -0.039037 | 0.366786 | 0.339134 |
| suicides_no | -0.004546 | 1.000000 | 0.616162 | 0.306604 | 0.151399 | 0.061330 |
| population | 0.008850 | 0.616162 | 1.000000 | 0.008285 | 0.102943 | 0.081510 |
| suicides/100k pop | -0.039037 | 0.306604 | 0.008285 | 1.000000 | 0.074279 | 0.001785 |
| HDI for year | 0.366786 | 0.151399 | 0.102943 | 0.074279 | 1.000000 | 0.771228 |
| gdp_per_capita | 0.339134 | 0.061330 | 0.081510 | 0.001785 | 0.771228 | 1.000000 |
Plot the correlation
# Annotated heatmap of the correlation matrix of the numeric columns.
plt.figure(figsize=(16, 6))

# Select numeric columns explicitly: the frame also holds text columns, and
# pandas >= 2.0 raises in DataFrame.corr() instead of dropping them silently.
numeric_corr = df.select_dtypes(include='number').corr()

# Fix the colour scale to the full [-1, 1] correlation range.
heatmap = sns.heatmap(numeric_corr, vmin=-1, vmax=1, annot=True, cmap='BrBG')

# The trailing semicolon suppresses the notebook echo of the returned Text object.
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize': 12}, pad=12);
# Distinct age-band labels present in the table.
df.age.unique()
array(['15-24 years', '35-54 years', '75+ years', '25-34 years',
'55-74 years', '5-14 years'], dtype=object)
# How many rows report each distinct suicide count (most frequent first).
df.suicides_no.value_counts()
0 4281
1 1539
2 1102
3 867
4 696
...
2158 1
525 1
2297 1
5241 1
2872 1
Name: suicides_no, Length: 2084, dtype: int64
# Share of missing values per column, largest first.
# NOTE: despite the name, the values are fractions in [0, 1], not percentages.
null_counts = df.isna().sum()
missing_percentages = null_counts.sort_values(ascending=False) / len(df)
missing_percentages
HDI for year 0.699353 country 0.000000 year 0.000000 sex 0.000000 age 0.000000 suicides_no 0.000000 population 0.000000 suicides/100k pop 0.000000 country_year 0.000000 gdp_for_year ($) 0.000000 gdp_per_capita 0.000000 generation 0.000000 dtype: float64
Missing percentages
# Horizontal bar chart restricted to the columns that have any missing values.
has_missing = missing_percentages != 0
missing_percentages[has_missing].plot(kind='barh')
<AxesSubplot:>
Delete the HDI for year column since it has many missing values
# Remove the sparsely populated HDI column in place, then list what remains.
df.drop(columns='HDI for year', inplace=True)
df.columns
Index(['country', 'year', 'sex', 'age', 'suicides_no', 'population',
'suicides/100k pop', 'country_year', ' gdp_for_year ($) ',
'gdp_per_capita', 'generation'],
dtype='object')
Unique Countries
# Distinct country names, in order of first appearance in the table.
countries = df['country'].unique()
countries
array(['Albania', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba',
'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
'Barbados', 'Belarus', 'Belgium', 'Belize',
'Bosnia and Herzegovina', 'Brazil', 'Bulgaria', 'Cabo Verde',
'Canada', 'Chile', 'Colombia', 'Costa Rica', 'Croatia', 'Cuba',
'Cyprus', 'Czech Republic', 'Denmark', 'Dominica', 'Ecuador',
'El Salvador', 'Estonia', 'Fiji', 'Finland', 'France', 'Georgia',
'Germany', 'Greece', 'Grenada', 'Guatemala', 'Guyana', 'Hungary',
'Iceland', 'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan',
'Kazakhstan', 'Kiribati', 'Kuwait', 'Kyrgyzstan', 'Latvia',
'Lithuania', 'Luxembourg', 'Macau', 'Maldives', 'Malta',
'Mauritius', 'Mexico', 'Mongolia', 'Montenegro', 'Netherlands',
'New Zealand', 'Nicaragua', 'Norway', 'Oman', 'Panama', 'Paraguay',
'Philippines', 'Poland', 'Portugal', 'Puerto Rico', 'Qatar',
'Republic of Korea', 'Romania', 'Russian Federation',
'Saint Kitts and Nevis', 'Saint Lucia',
'Saint Vincent and Grenadines', 'San Marino', 'Serbia',
'Seychelles', 'Singapore', 'Slovakia', 'Slovenia', 'South Africa',
'Spain', 'Sri Lanka', 'Suriname', 'Sweden', 'Switzerland',
'Thailand', 'Trinidad and Tobago', 'Turkey', 'Turkmenistan',
'Ukraine', 'United Arab Emirates', 'United Kingdom',
'United States', 'Uruguay', 'Uzbekistan'], dtype=object)
Age Count
# NOTE(review): despite the name, this is the raw `age` column (one label per
# row), not a tally -- `df['age'].value_counts()` may have been intended.
age_count = df['age']
age_count
0 15-24 years
1 35-54 years
2 15-24 years
3 75+ years
4 25-34 years
...
27815 35-54 years
27816 75+ years
27817 5-14 years
27818 5-14 years
27819 55-74 years
Name: age, Length: 27820, dtype: object
# Unweighted average of the per-row rate column (sum of values / row count).
rate_column = df['suicides/100k pop']
suicides_per_100k = rate_column.sum() / len(rate_column)
suicides_per_100k
12.816097411933864
The unweighted mean of the per-row suicides/100k pop values is approx. 12.82 (an average over rows, not a population-weighted overall rate).
# Distribution of suicide counts for the first 30 rows of the table only.
suicides_number = df['suicides_no'][:30]

# Seaborn version: 20 bins with a kernel-density overlay.
sns.histplot(suicides_number, bins=20, kde=True)
plt.show()

# Interactive plotly version of the same sample.
px.histogram(suicides_number)
Sex count
# Share of table rows per sex.
px.pie(df, names='sex')

# Population column summed per sex (plotly sums `y` within each `x` bin).
fig = px.histogram(df, x='sex', y='population')
fig.show()

# Suicide counts summed per sex.
px.histogram(
    df,
    x='sex',
    y='suicides_no',
    labels={'sex': 'Gender', 'suicides_no': 'Suicides Number'},
    title='SUICIDE BY GENDER',
)

# Per-100k rate values summed per sex.
px.histogram(
    df,
    x='sex',
    y='suicides/100k pop',
    labels={'sex': 'Gender'},
    title='SUICIDES PER 100K POPULATION BY GENDER',
)
# Per-100k rate values binned by GDP per capita and coloured by sex, with a
# box-plot marginal drawn above the histogram.
gdp_axis_labels = {'gdp_per_capita': 'GDP per Capita'}
fig = px.histogram(
    df,
    x='gdp_per_capita',
    y='suicides/100k pop',
    color='sex',
    marginal='box',
    title='SUICIDES BY GDP PER CAPITA AND SEX',
    labels=gdp_axis_labels,
    height=600,
)
fig.show()
# Per-100k rate values summed per year, with a box-plot marginal.
# The title previously read "SUICIDES BY YEAR AND GENDER", but this figure has
# no sex split (no `color`) and plots the per-100k rate rather than counts, so
# the title now describes what is actually drawn.
fig = px.histogram(
    df,
    x='year',
    y='suicides/100k pop',
    height=500,
    marginal='box',
    title='SUICIDES PER 100K BY YEAR',
    labels={'year': 'Year'},
)
fig.show()
# Suicide counts summed per year, split by sex, with a box-plot marginal.
fig = px.histogram(
    df,
    x='year',
    y='suicides_no',
    color='sex',
    marginal='box',
    labels={'year': 'Year'},
    title='SUICIDES BY YEAR AND GENDER',
    height=500,
)
fig.show()

# Same layout for the per-100k rate column.
fig = px.histogram(
    df,
    x='year',
    y='suicides/100k pop',
    color='sex',
    marginal='box',
    labels={'year': 'Year'},
    title='SUICIDES PER 100K BY YEAR AND GENDER',
    height=600,
)
fig.show()
# Suicide counts summed per age band, split by sex.
fig = px.histogram(
    df,
    x='age',
    y='suicides_no',
    color='sex',
    labels={'age': 'Age'},
    title='SUICIDES BY AGE AND SEX',
    height=500,
)
fig.show()

# Per-100k rate values summed per age band, split by sex.
fig = px.histogram(
    df,
    x='age',
    y='suicides/100k pop',
    color='sex',
    labels={'age': 'Age'},
    title='SUICIDES PER 100K BY AGE AND SEX',
    height=500,
)
fig.show()
# Suicide counts summed per generation, split by sex.
fig = px.histogram(
    df,
    x='generation',
    y='suicides_no',
    color='sex',
    labels={'generation': 'Generation'},
    title='SUICIDES BY GENERATION AND GENDER',
    height=500,
)
fig.show()

# Per-100k rate values summed per generation, split by sex.
fig = px.histogram(
    df,
    x='generation',
    y='suicides/100k pop',
    color='sex',
    labels={'generation': 'Generation'},
    title='SUICIDES PER 100K BY GENERATION AND GENDER',
    height=500,
)
fig.show()
# Per-100k rate values summed per year and coloured by generation, with a
# box-plot marginal.  The title previously said "...AND GENDER" although the
# colour dimension is `generation`; it now matches the plotted grouping.
fig = px.histogram(
    df,
    x='year',
    y='suicides/100k pop',
    color='generation',
    height=600,
    marginal='box',
    title='SUICIDES PER 100K BY YEAR AND GENERATION',
    labels={'year': 'Year'},
)
fig.show()
# One point per table row: per-100k rate against year, coloured by sex.
fig = px.scatter(df, x='year', y='suicides/100k pop', color='sex', height=500)
fig.show()
# Per-100k rate values summed per country, split by sex.
fig = px.histogram(
    df,
    x='country',
    y='suicides/100k pop',
    color='sex',
    title='SUICIDES PER 100K BY COUNTRY ',
    width=1050,
    height=600,
)
fig.show()

# One point per table row: per-100k rate per country, coloured by generation.
fig = px.scatter(
    df,
    x='country',
    y='suicides/100k pop',
    color='generation',
    title='SUICIDES PER 100K BY COUNTRY ',
    height=600,
)
fig.show()

# Per-100k rate values summed per country, split by age band.
fig = px.histogram(
    df,
    x='country',
    y='suicides/100k pop',
    color='age',
    title='SUICIDES PER 100K BY COUNTRY ',
    height=600,
)
fig.show()
# Inspect the total-GDP column.  Its name still carries the leading/trailing
# spaces and "($)" suffix, and its values are comma-formatted strings (dtype
# object), not numbers.
df[' gdp_for_year ($) ']
0 2,156,624,900
1 2,156,624,900
2 2,156,624,900
3 2,156,624,900
4 2,156,624,900
...
27815 63,067,077,179
27816 63,067,077,179
27817 63,067,077,179
27818 63,067,077,179
27819 63,067,077,179
Name: gdp_for_year ($) , Length: 27820, dtype: object
# 1995 rows: per-100k rate against GDP per capita (log x axis); bubble size is
# the row's population, colour is generation.
rows_1995 = df.query("year==1995")
fig = px.scatter(
    rows_1995,
    x="gdp_per_capita",
    y="suicides/100k pop",
    color="generation",
    size="population",
    size_max=60,
    hover_name="country",
    log_x=True,
    height=500,
)
fig.show()

# 2007 rows: same layout, coloured by sex instead of generation.
rows_2007 = df.query("year==2007")
fig = px.scatter(
    rows_2007,
    x="gdp_per_capita",
    y="suicides/100k pop",
    color="sex",
    size="population",
    size_max=60,
    hover_name="country",
    log_x=True,
    height=500,
)
fig.show()

# 1995 rows again: suicide counts on the y axis, bubble size is the per-100k rate.
fig = px.scatter(
    df.query("year==1995"),
    x="gdp_per_capita",
    y="suicides_no",
    color="generation",
    size="suicides/100k pop",
    size_max=60,
    hover_name="country",
    log_x=True,
    title='SUICIDES BY GDP PER CAPITA',
    height=500,
)
fig.show()
# GDP-per-capita values summed per year.
# NOTE(review): the sum runs over every table row, so a country's value is
# counted once per sex/age row of that year -- confirm this is intended.
fig = px.histogram(
    df,
    x='year',
    y='gdp_per_capita',
    title='GDP PER CAPITA BY YEAR',
    height=500,
)
fig.show()

# Per-100k rate values binned by GDP per capita and coloured by sex, with a
# box-plot marginal (untitled).
fig = px.histogram(
    df,
    x='gdp_per_capita',
    y='suicides/100k pop',
    color='sex',
    marginal='box',
    height=600,
)
fig.show()
# Per-100k rate values summed per country, split by sex.
# The title and axis-label mapping previously referred to "year", which this
# figure does not plot; both now refer to the `country` x axis.
fig = px.histogram(
    df,
    x='country',
    y='suicides/100k pop',
    color='sex',
    height=600,
    title='SUICIDES PER 100K BY COUNTRY AND GENDER',
    labels={'country': 'Country'},
)
fig.show()
# Re-list the columns available for the seaborn plots below.
df.columns
Index(['country', 'year', 'sex', 'age', 'suicides_no', 'population',
'suicides/100k pop', 'country_year', ' gdp_for_year ($) ',
'gdp_per_capita', 'generation'],
dtype='object')
# Line plot of suicide counts across countries, one line per age band.
# Seaborn aggregates the repeated rows per country (mean by default) and, with
# no `errorbar` override here, also draws its default uncertainty band.
plt.figure(figsize=(18, 6))
ax = sns.lineplot(data=df, x='country', y='suicides_no', hue='age')
ax.set_xlabel("Country", fontsize=14)
ax.set_ylabel("Suicide No", fontsize=14)
# Country names are long, so rotate the tick labels to vertical.
plt.xticks(rotation=90)
# Pin the left edge of the x axis at the first country.
ax.set_xlim(0)
ax.set_title("Suicide by Country and Age", fontsize=16, font='Roboto')
Text(0.5, 1.0, 'Suicide by Country and Age')
# Line plot of the per-100k rate across countries, one line per age band.
# `errorbar=None` turns off seaborn's uncertainty band around each line.
plt.figure(figsize=(18, 6))
ax = sns.lineplot(
    data=df,
    x='country',
    y='suicides/100k pop',
    hue='age',
    errorbar=None,
)
ax.set_xlabel("Country", fontsize=14)
ax.set_ylabel("Suicide per 100K", fontsize=14)
# Country names are long, so rotate the tick labels to vertical.
plt.xticks(rotation=90)
# Pin the left edge of the x axis at the first country.
ax.set_xlim(0)
ax.set_title("Suicide per 100K by Country and Age", fontsize=16, font='Roboto')
Text(0.5, 1.0, 'Suicide per 100K by Country and Age')